In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [4]:
df = pd.read_csv('SpotifyFeatures.csv')
df
Out[4]:
genre artist_name track_name track_id popularity acousticness danceability duration_ms energy instrumentalness key liveness loudness mode speechiness tempo time_signature valence
0 Movie Henri Salvador C'est beau de faire un Show 0BRjO6ga9RKCKjfDqeFgWV 0 0.61100 0.389 99373 0.910 0.000000 C# 0.3460 -1.828 Major 0.0525 166.969 4/4 0.814
1 Movie Martin & les fées Perdu d'avance (par Gad Elmaleh) 0BjC1NfoEOOusryehmNudP 1 0.24600 0.590 137373 0.737 0.000000 F# 0.1510 -5.559 Minor 0.0868 174.003 4/4 0.816
2 Movie Joseph Williams Don't Let Me Be Lonely Tonight 0CoSDzoNIKCRs124s9uTVy 3 0.95200 0.663 170267 0.131 0.000000 C 0.1030 -13.879 Minor 0.0362 99.488 5/4 0.368
3 Movie Henri Salvador Dis-moi Monsieur Gordon Cooper 0Gc6TVm52BwZD07Ki6tIvf 0 0.70300 0.240 152427 0.326 0.000000 C# 0.0985 -12.178 Major 0.0395 171.758 4/4 0.227
4 Movie Fabien Nataf Ouverture 0IuslXpMROHdEPvSl1fTQK 4 0.95000 0.331 82625 0.225 0.123000 F 0.2020 -21.150 Major 0.0456 140.576 4/4 0.390
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
232720 Soul Slave Son Of Slide 2XGLdVl7lGeq8ksM6Al7jT 39 0.00384 0.687 326240 0.714 0.544000 D 0.0845 -10.626 Major 0.0316 115.542 4/4 0.962
232721 Soul Jr Thomas & The Volcanos Burning Fire 1qWZdkBl4UVPj9lK6HuuFM 38 0.03290 0.785 282447 0.683 0.000880 E 0.2370 -6.944 Minor 0.0337 113.830 4/4 0.969
232722 Soul Muddy Waters (I'm Your) Hoochie Coochie Man 2ziWXUmQLrXTiYjCg2fZ2t 47 0.90100 0.517 166960 0.419 0.000000 D 0.0945 -8.282 Major 0.1480 84.135 4/4 0.813
232723 Soul R.LUM.R With My Words 6EFsue2YbIG4Qkq8Zr9Rir 44 0.26200 0.745 222442 0.704 0.000000 A 0.3330 -7.137 Major 0.1460 100.031 4/4 0.489
232724 Soul Mint Condition You Don't Have To Hurt No More 34XO9RwPMKjbvRry54QzWn 35 0.09730 0.758 323027 0.470 0.000049 G# 0.0836 -6.708 Minor 0.0287 113.897 4/4 0.479

232725 rows × 18 columns

Exploratory Data Analysis and Visualization¶

In [3]:
df.describe()
Out[3]:
popularity acousticness danceability duration_ms energy instrumentalness liveness loudness speechiness tempo valence
count 232725.000000 232725.000000 232725.000000 2.327250e+05 232725.000000 232725.000000 232725.000000 232725.000000 232725.000000 232725.000000 232725.000000
mean 41.127502 0.368560 0.554364 2.351223e+05 0.570958 0.148301 0.215009 -9.569885 0.120765 117.666585 0.454917
std 18.189948 0.354768 0.185608 1.189359e+05 0.263456 0.302768 0.198273 5.998204 0.185518 30.898907 0.260065
min 0.000000 0.000000 0.056900 1.538700e+04 0.000020 0.000000 0.009670 -52.457000 0.022200 30.379000 0.000000
25% 29.000000 0.037600 0.435000 1.828570e+05 0.385000 0.000000 0.097400 -11.771000 0.036700 92.959000 0.237000
50% 43.000000 0.232000 0.571000 2.204270e+05 0.605000 0.000044 0.128000 -7.762000 0.050100 115.778000 0.444000
75% 55.000000 0.722000 0.692000 2.657680e+05 0.787000 0.035800 0.264000 -5.501000 0.105000 139.054000 0.660000
max 100.000000 0.996000 0.989000 5.552917e+06 0.999000 0.999000 1.000000 3.744000 0.967000 242.903000 1.000000
In [21]:
plt.figure(figsize=(12,10))
sns.histplot(data=df, x='genre',color='c')
plt.xticks(rotation=70);
In [70]:
plt.figure(figsize=(12,10))
sns.histplot(data=df, x='key')
Out[70]:
<AxesSubplot:xlabel='key', ylabel='Count'>
In [22]:
fig = plt.figure(figsize = (15,20));
ax = fig.gca();
df.hist(ax = ax,bins=10);
C:\Users\annam\AppData\Local\Temp\ipykernel_31132\2283546626.py:3: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared.
  df.hist(ax = ax,bins=10);
In [60]:
df.columns
Out[60]:
Index(['genre', 'artist_name', 'track_name', 'track_id', 'popularity',
       'acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence'],
      dtype='object')

Top 15 songs¶

In [51]:
df.sort_values('popularity',ascending=False)[['track_name','artist_name','popularity']].drop_duplicates()[0:15]
Out[51]:
track_name artist_name popularity
9027 7 rings Ariana Grande 100
86951 Wow. Post Malone 99
107802 break up with your girlfriend, i'm bored Ariana Grande 99
66643 Con Calma Daddy Yankee 98
107829 Sweet but Psycho Ava Max 97
86953 Sunflower - Spider-Man: Into the Spider-Verse Post Malone 97
107875 Calma - Remix Pedro Capó 97
92824 Without Me Halsey 97
107812 Happier Marshmello 97
107810 Dancing With A Stranger (with Normani) Sam Smith 97
107851 Taki Taki (with Selena Gomez, Ozuna & Cardi B) DJ Snake 96
107830 Shallow Lady Gaga 96
86952 MIDDLE CHILD J. Cole 96
138916 Secreto Anuel Aa 96
138917 Baila Baila Baila Ozuna 95

Top Genres¶

In [24]:
order = df.groupby(["genre"])["popularity"].mean().sort_values(ascending=False).index
sns.catplot(data=df, x='genre', y='popularity', height=5, aspect=2, kind='box',order=order)
plt.xticks(rotation=70);
In [25]:
for col in df.select_dtypes(include=['object','category']):
    print('%s\n%s\n' % (df[col].name, df[col].unique()))
genre
['Movie' 'R&B' 'A Capella' 'Alternative' 'Country' 'Dance' 'Electronic'
 'Anime' 'Folk' 'Blues' 'Opera' 'Hip-Hop' "Children's Music"
 'Children’s Music' 'Rap' 'Indie' 'Classical' 'Pop' 'Reggae' 'Reggaeton'
 'Jazz' 'Rock' 'Ska' 'Comedy' 'Soul' 'Soundtrack' 'World']

artist_name
['Henri Salvador' 'Martin & les fées' 'Joseph Williams' ... 'Dharmasoul'
 'Swim' 'Jr Thomas & The Volcanos']

track_name
["C'est beau de faire un Show" "Perdu d'avance (par Gad Elmaleh)"
 "Don't Let Me Be Lonely Tonight" ... 'P.O.P.' 'Burning Fire'
 "You Don't Have To Hurt No More"]

track_id
['0BRjO6ga9RKCKjfDqeFgWV' '0BjC1NfoEOOusryehmNudP'
 '0CoSDzoNIKCRs124s9uTVy' ... '2iZf3EUedz9MPqbAvXdpdA'
 '1qWZdkBl4UVPj9lK6HuuFM' '34XO9RwPMKjbvRry54QzWn']

key
['C#' 'F#' 'C' 'F' 'G' 'E' 'D#' 'G#' 'D' 'A#' 'A' 'B']

mode
['Major' 'Minor']

time_signature
['4/4' '5/4' '3/4' '1/4' '0/4']

Headmap¶

In [26]:
plt.figure(figsize=(12,10))
sns.heatmap(df.corr(), cmap='twilight_shifted',annot=True,)
Out[26]:
<AxesSubplot:>

Feature engineering¶

In [16]:
df['popularity1'] = df.apply(lambda x: 0 if x['popularity']<=45 else 1 if x['popularity']<=60 else 2 , axis=1)
In [17]:
sns.countplot(df['popularity1'],palette='pastel').set_xticklabels(['0-45', '46-60','61 & more']);
C:\Users\annam\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [18]:
X0 = df[['genre','acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence']]
In [19]:
y = df['popularity1']
In [20]:
X = pd.get_dummies(X0)
X.head()
Out[20]:
acousticness danceability duration_ms energy instrumentalness liveness loudness speechiness tempo valence ... key_F# key_G key_G# mode_Major mode_Minor time_signature_0/4 time_signature_1/4 time_signature_3/4 time_signature_4/4 time_signature_5/4
0 0.611 0.389 99373 0.910 0.000 0.3460 -1.828 0.0525 166.969 0.814 ... 0 0 0 1 0 0 0 0 1 0
1 0.246 0.590 137373 0.737 0.000 0.1510 -5.559 0.0868 174.003 0.816 ... 1 0 0 0 1 0 0 0 1 0
2 0.952 0.663 170267 0.131 0.000 0.1030 -13.879 0.0362 99.488 0.368 ... 0 0 0 0 1 0 0 0 0 1
3 0.703 0.240 152427 0.326 0.000 0.0985 -12.178 0.0395 171.758 0.227 ... 0 0 0 1 0 0 0 0 1 0
4 0.950 0.331 82625 0.225 0.123 0.2020 -21.150 0.0456 140.576 0.390 ... 0 0 0 1 0 0 0 0 1 0

5 rows × 56 columns

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [23]:
scaler = StandardScaler()
In [24]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Pipeline, GridSearchCV & Modelling¶

In [107]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
In [504]:
pipe = Pipeline([
    ("classifier", RandomForestClassifier())])
In [511]:
sp = [
    {"classifier": [RandomForestClassifier(),xgb.XGBClassifier()],
    "classifier__n_estimators": [100,120]},
    {"classifier": [LogisticRegression()],
     "classifier__C": [np.logspace(0,4,10)]}
    ]
In [512]:
grid = GridSearchCV(pipe, sp, cv=3, verbose=0)
In [513]:
ml_g = grid.fit(X_train,y_train)
C:\Users\annam\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:372: FitFailedWarning: 
3 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\annam\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\annam\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\annam\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1464, in fit
    raise ValueError("Penalty term must be positive; got (C=%r)" % self.C)
ValueError: Penalty term must be positive; got (C=array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]))

  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\annam\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:969: UserWarning: One or more of the test scores are non-finite: [0.75956064 0.75983994 0.76597916 0.76766033        nan]
  warnings.warn(
In [514]:
ml_g.best_params_
Out[514]:
{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               n_estimators=120, n_jobs=None, num_parallel_tree=None,
               predictor=None, random_state=None, ...),
 'classifier__n_estimators': 120}

Prediction¶

In [516]:
p2 = ml_g.predict(X_test)
accuracy_score(y_test,p2)
Out[516]:
0.7672145235793318
In [181]:
y_train.value_counts()
Out[181]:
0    102193
1     58057
2     25930
Name: popularity1, dtype: int64
In [178]:
ml3 = LogisticRegression(solver='sag')
ml3.fit(X_train,y_train)
p3 = ml3.predict(X_test)
accuracy_score(p3,y_test)
C:\Users\annam\anaconda3\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(
Out[178]:
0.7604683639488667

Pipeline ¬ PCA + artist_name in X¶

In [183]:
from sklearn.pipeline import FeatureUnion
In [184]:
from sklearn.decomposition import PCA
In [249]:
df_a = df.sample(frac=0.01)
In [433]:
X0_a = df_a[['artist_name','genre','acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence']]
In [434]:
y_a = df_a['popularity1']
In [435]:
X_a = pd.get_dummies(X0_a)

X_a.head()
Out[435]:
acousticness danceability duration_ms energy instrumentalness liveness loudness speechiness tempo valence ... key_F key_F# key_G key_G# mode_Major mode_Minor time_signature_1/4 time_signature_3/4 time_signature_4/4 time_signature_5/4
60497 0.487 0.511 344320 0.391 0.000000 0.1120 -10.850 0.0443 112.196 0.580 ... 0 0 1 0 1 0 0 0 1 0
46728 0.167 0.481 189293 0.689 0.001530 0.7390 -11.629 0.2140 144.851 0.782 ... 1 0 0 0 1 0 0 0 1 0
50849 0.388 0.770 215493 0.565 0.000179 0.0846 -7.185 0.0333 114.069 0.266 ... 0 0 0 0 1 0 0 0 1 0
45140 0.264 0.458 235495 0.930 0.000353 0.2250 -4.303 0.0983 170.907 0.583 ... 0 0 0 1 1 0 0 0 1 0
25773 0.566 0.706 304333 0.520 0.857000 0.1580 -7.496 0.0290 108.994 0.354 ... 0 0 0 1 0 1 0 0 1 0

5 rows × 1746 columns

In [442]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_a, y_a, test_size=0.2, random_state=42)

Preprocessing - std & PCA¶

In [443]:
preprocess = FeatureUnion([("std", StandardScaler()), ("pca",PCA())])

Pipeline¶

In [444]:
pipe_pca = Pipeline([
    ("preprocess", preprocess),
    ("classifier", xgb.XGBClassifier())])
In [471]:
search_p = [{"preprocess__pca__n_components": [100],
            "classifier__n_estimators": [120]}]
In [472]:
grid_pca = GridSearchCV(pipe_pca, param_grid=search_p, cv=3)
In [473]:
mg2 = grid_pca.fit(X_train1,y_train1)
In [474]:
mg2.best_params_
Out[474]:
{'classifier__n_estimators': 120, 'preprocess__pca__n_components': 100}

Prediction¶

In [475]:
pp = mg2.predict(X_test1)
accuracy_score(y_test1,pp)
Out[475]:
0.7253218884120172
In [424]:
mg2.best_estimator_.named_steps['preprocess']
Out[424]:
FeatureUnion(transformer_list=[('std', StandardScaler()),
                               ('pca', PCA(n_components=500))])
In [426]:
mg2.best_estimator_.get_params()['preprocess__transformer_list']
Out[426]:
[('std', StandardScaler()), ('pca', PCA(n_components=500))]

Pipeline (+ artist_name in X) ¬ without PCA¶

In [476]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_a, y_a, test_size=0.2, random_state=42)
In [477]:
preprocess_no_pca = FeatureUnion([("std", StandardScaler())])
In [500]:
pipe_no_pca = Pipeline([
    ("preprocess_no_pca", preprocess_no_pca),
    ("classifier", xgb.XGBClassifier())])
search_no_p = [{#"preprocess__pca__n_components": [100],
            "classifier__n_estimators": [120]}]
In [501]:
grid_no_pca = GridSearchCV(pipe_no_pca, param_grid=search_no_p, cv=3)
In [502]:
mg_npca = grid_no_pca.fit(X_train1,y_train1)

Prediction¶

In [503]:
pp1 = mg_npca.predict(X_test1)
accuracy_score(y_test1,pp1)
Out[503]:
0.6974248927038627

Clustering¶

In [7]:
from sklearn.cluster import AgglomerativeClustering
In [8]:
sample1 = df.sample(frac=0.01)
In [9]:
sample2 = sample1[['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity']]
In [26]:
sample2_s = scaler.fit_transform(sample2)

AgglomerativeClustering¶

In [27]:
cluAgg = AgglomerativeClustering()
In [28]:
cluAgg_m = cluAgg.fit(sample2_s)
In [29]:
sample1['cluAgg'] = cluAgg_m.labels_
sample2['cluAgg'] = cluAgg_m.labels_
C:\Users\annam\AppData\Local\Temp\ipykernel_33544\130059744.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample2['cluAgg'] = cluAgg_m.labels_
In [30]:
import plotly.express as px
In [31]:
fig = px.scatter_3d(sample1, x='acousticness', y='danceability', z='popularity',
              color='cluAgg', size='popularity', size_max=30,hover_data=sample1,
              symbol='cluAgg', opacity=1)

# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
In [44]:
fig = plt.figure(figsize = (15,3));


order = (sample1[sample1.cluAgg==0]).groupby(["genre"])["genre"].count().sort_values(ascending=False).index

sns.countplot(data=sample1[sample1.cluAgg==0],x='genre',order=order);
plt.xticks(rotation=70);
plt.title("Agglomerative Clustering - 0")
Out[44]:
Text(0.5, 1.0, 'Agglomerative Clustering - 0')
In [45]:
fig = plt.figure(figsize = (15,3));


order = (sample1[sample1.cluAgg==1]).groupby(["genre"])["genre"].count().sort_values(ascending=False).index

sns.countplot(data=sample1[sample1.cluAgg==1],x='genre',order=order);
plt.xticks(rotation=70);
plt.title("Agglomerative Clustering - 1")
Out[45]:
Text(0.5, 1.0, 'Agglomerative Clustering - 1')

KMeans¶

In [34]:
from sklearn.cluster import KMeans
In [35]:
clus_k = KMeans(2)
In [36]:
clus_k_m = clus_k.fit(sample2_s)
In [37]:
sample1['kmeans'] = clus_k_m.labels_
sample1['kmeans'] = clus_k_m.labels_
In [39]:
fig = px.scatter_3d(sample1, x='acousticness', y='danceability', z='popularity',
              color='kmeans', size='popularity', size_max=30,hover_data=sample1,
              symbol='kmeans', opacity=1)

# tight layout
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
In [46]:
fig = plt.figure(figsize = (15,3));

order = (sample1[sample1.kmeans==0]).groupby(["genre"])["genre"].count().sort_values(ascending=False).index

sns.countplot(data=sample1[sample1.kmeans==0],x='genre',order=order);
plt.xticks(rotation=70);
plt.title("Kmeans Clustering - 0")
Out[46]:
Text(0.5, 1.0, 'Kmeans Clustering - 0')
In [47]:
fig = plt.figure(figsize = (15,3));

order = (sample1[sample1.kmeans==1]).groupby(["genre"])["genre"].count().sort_values(ascending=False).index

sns.countplot(data=sample1[sample1.kmeans==1],x='genre',order=order);
plt.xticks(rotation=70);
plt.title("Kmeans Clustering - 1")
Out[47]:
Text(0.5, 1.0, 'Kmeans Clustering - 1')
In [ ]:
 
In [ ]: